Motivation: exact tree-preserving embedding is expensive; we want a cheap approximation that still respects hierarchical cluster structure.
Idea: compute a hierarchical clustering, take the cophenetic distances it induces, and embed those distances in 2D with MDS.
Notes: see def embedding(...) below for this approach.
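In outline (a minimal sketch, assuming a data matrix X; the full version with timing, labels, and plotting is the embedding function defined below):
In [ ]:
from scipy.cluster import hierarchy
from scipy.spatial import distance
from sklearn.manifold import MDS
c = hierarchy.linkage(X, 'single')                        # (1) linkage matrix
d = distance.squareform(hierarchy.cophenet(c))            # (2) cophenetic distances
X_2d = MDS(dissimilarity='precomputed').fit_transform(d)  # (3) 2D embedding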
In [300]:
import numpy as np
import pylab as pl
pl.rcParams['font.family'] = 'serif'
%matplotlib inline
In [301]:
from scipy.cluster import hierarchy
from scipy.spatial import distance
In [302]:
from sklearn import datasets
data = datasets.load_digits()
X = data.data
Y = data.target
In [14]:
c = hierarchy.single(X)
c.shape
Out[14]:
In [15]:
c[:10]
Out[15]:
In [16]:
np.max(c,0)
Out[16]:
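Each row i of the linkage matrix records one merge: clusters c[i,0] and c[i,1] are joined at height c[i,2] into a new cluster of c[i,3] members, and that merged cluster gets index len(X) + i. A quick illustration with the first row:
In [ ]:
left, right, height, size = c[0]
print('step 0: merge clusters {0} and {1} at height {2:.2f} '
      '-> new cluster {3} with {4} members'.format(int(left), int(right),
                                                   height, len(X), int(size)))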
In [17]:
import networkx as nx
In [18]:
pl.scatter(X[:,0],X[:,1],c=Y,linewidths=0)
Out[18]:
In [19]:
G = nx.Graph()
In [20]:
n = len(X)  # number of original observations
for i, (x, y, w, _) in enumerate(c):
    # row i of the linkage matrix merges clusters x and y at height w;
    # scipy assigns the merged cluster the index n + i
    G.add_edge(int(x), n + i, weight=w)
    G.add_edge(int(y), n + i, weight=w)
In [130]:
list(G.nodes())[:10], list(G.edges())[:10]
Out[130]:
In [131]:
nx.number_connected_components(G)
Out[131]:
In [132]:
cc = list(nx.connected_components(G))
len(cc)
Out[132]:
In [133]:
next_node = max(G.nodes())+1
In [134]:
[attrs['weight'] for u, v, attrs in G.edges(data=True)]
In [234]:
max_weight = max([G[x][y]['weight'] for (x,y) in G.edges()])
max_weight
Out[234]:
In [235]:
for component in cc:
    # tie each component to one hub node with a weight larger than any
    # real merge height, so the graph becomes connected for shortest paths
    G.add_edge(next(iter(component)), next_node, weight=max_weight*2)
In [236]:
nx.number_connected_components(G)
Out[236]:
In [237]:
d = nx.algorithms.floyd_warshall_numpy(G, nodelist=sorted(G.nodes()))  # row/col order = node id, so d[:n,:n] covers the original points
d.shape
Out[237]:
In [238]:
pl.imshow(d[:n,:n])
pl.colorbar()
Out[238]:
In [239]:
np.max(d[:n,:n]),np.min(d[:n,:n])
Out[239]:
In [241]:
pl.imshow(distance.squareform(distance.pdist(X)))
pl.colorbar()
Out[241]:
In [242]:
hierarchy.dendrogram(c)
In [243]:
labels = Y  # class labels for the original points; -1 marks synthetic cluster nodes
full_labels = np.ones(len(G.nodes()))*-1
full_labels[:len(labels)] = labels
In [244]:
nx.draw_graphviz(G,node_size=10.0,node_color=full_labels,
linewidths=0)
In [26]:
d = distance.squareform(hierarchy.cophenet(c))
d.shape
Out[26]:
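Here d[i,j] is the cophenetic distance: the merge height at which points i and j first fall into the same cluster. As a quick quality check, hierarchy.cophenet can also report the cophenetic correlation between these tree distances and the original pairwise distances:
In [ ]:
coph_corr, coph_dists = hierarchy.cophenet(c, distance.pdist(X))
coph_corr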
In [27]:
from sklearn.manifold import MDS
In [28]:
mds = MDS(dissimilarity='precomputed')
X_ = mds.fit_transform(d[:n,:n])
In [29]:
X_[:,0].shape,labels.shape
Out[29]:
In [30]:
pl.scatter(X_[:,0],X_[:,1],c=labels,
           linewidths=0)
Out[30]:
In [ ]:
pl.scatter(X[:,0],X[:,1],c=labels,
           linewidths=0)
In [31]:
from time import time
In [32]:
def embedding(X, labels=None, linkage='single',
              profile=True):
    '''(1) compute the linkage matrix,
       (2) compute pairwise cophenetic distances from the linkage matrix,
       (3) compute a 2D MDS embedding of those distances'''
    t = time()
    n = len(X)
    c = hierarchy.linkage(X, linkage)
    d = distance.squareform(hierarchy.cophenet(c))
    t1 = time()
    print('Computed cophenetic distances in {0:.2f}s'.format(t1 - t))
    mds = MDS(dissimilarity='precomputed')
    X_ = mds.fit_transform(d[:n, :n])
    t2 = time()
    print('Computed MDS embedding in {0:.2f}s'.format(t2 - t1))
    if labels is None:
        pl.scatter(X_[:, 0], X_[:, 1])
    else:
        pl.scatter(X_[:, 0], X_[:, 1], c=labels,
                   linewidths=0)
    return X_, c
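A typical call, mirroring the cells below; the subsample size here is only an illustration, since the dense cophenetic matrix grows as O(n^2):
In [ ]:
X_sub, c_sub = embedding(X[:500], Y[:500], linkage='average')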
In [108]:
data = datasets.load_digits()
X = data.data
Y = data.target
In [94]:
from sklearn.manifold import TSNE
t = TSNE()
X_tsne = t.fit_transform(X)
In [95]:
pl.scatter(X_tsne[:,0],X_tsne[:,1],c=Y,cmap='rainbow')
Out[95]:
In [98]:
from sklearn import neighbors
In [99]:
one_nn_class_baseline(X_tsne,Y)  # defined further down, in cell In [91]
Out[99]:
In [100]:
one_nn_class_baseline(X,Y)
Out[100]:
In [109]:
c = hierarchy.linkage(X,'single')
d = distance.squareform(hierarchy.cophenet(c))
In [89]:
t = TSNE(metric='precomputed')
X_tsne = t.fit_transform(d)
In [90]:
pl.scatter(X_tsne[:,0],X_tsne[:,1],c=Y,cmap='rainbow')
Out[90]:
In [79]:
pl.imshow(d,cmap='Blues')
pl.colorbar()
Out[79]:
In [148]:
actual_distance = distance.squareform(distance.pdist(X))
In [259]:
beta = 0.01
alpha = 0.6
# blend a kernel on the cophenetic distances with a kernel on the actual Euclidean distances
#similarity = np.exp(-beta * d / (actual_distance+0.01))
#similarity = (1-alpha)*np.exp(-beta * d / (d.std(0))) + (alpha/(1+actual_distance))
similarity = (1-alpha)*np.exp(-beta * d / d.std(0)) + alpha*np.exp(-beta*actual_distance)
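KernelPCA with kernel='precomputed' (used below) assumes a symmetric positive semidefinite matrix. A quick check of the basic requirements; note the per-column scaling by d.std(0) is not guaranteed to preserve symmetry, which this will reveal:
In [ ]:
print(np.allclose(similarity, similarity.T))  # symmetry
print(np.allclose(np.diag(similarity), 1.0))  # unit self-similarity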
In [260]:
from sklearn.decomposition import KernelPCA
se = KernelPCA(n_components=2,kernel='precomputed')
X_se = se.fit_transform(similarity)
pl.scatter(X_se[:,0],X_se[:,1],c=Y,cmap='rainbow')
pl.colorbar()
Out[260]:
In [240]:
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)
pl.scatter(X_pca[:,0],X_pca[:,1],c=Y,cmap='rainbow')
pl.colorbar()
Out[240]:
In [261]:
import sklearn
sklearn.__version__
Out[261]:
In [241]:
one_nn_class_baseline(X_pca,Y),one_nn_class_baseline(X_se,Y)
Out[241]:
In [267]:
from scipy.spatial.distance import pdist,squareform
d = squareform(pdist(X,'minkowski',p=1))  # Manhattan distances
r = 0.5
(d < r).sum(0)
In [262]:
from sklearn.neighbors import radius_neighbors_graph
In [266]:
r = radius_neighbors_graph(X,0.1,p=1)
r
In [265]:
r.sum()
Out[265]:
In [242]:
from sklearn.decomposition import KernelPCA
In [163]:
kpca = KernelPCA(n_components=2,kernel='precomputed')
X_kpca = kpca.fit_transform(similarity)
pl.scatter(X_kpca[:,0],X_kpca[:,1],c=Y,cmap='rainbow')
Out[163]:
In [152]:
t = TSNE(metric='precomputed')
X_tsne = t.fit_transform(similarity)  # caution: metric='precomputed' expects distances, not similarities
In [ ]:
pl.scatter(X_tsne[:,0],X_tsne[:,1],c=Y,cmap='rainbow')
In [113]:
d.std()
Out[113]:
In [71]:
from sklearn.cluster.bicluster import SpectralBiclustering
model = SpectralBiclustering(n_clusters=10, method='log',
                             random_state=0)
In [105]:
model.fit(X)
In [106]:
fit_data = X[np.argsort(model.row_labels_)]
fit_data = fit_data[:, np.argsort(model.column_labels_)]
In [107]:
pl.imshow(fit_data,cmap='Blues')
pl.colorbar()
Out[107]:
In [68]:
import seaborn as sns
sns.kdeplot(d.flatten())
Out[68]:
In [38]:
X_,c = embedding(X[:1000],Y[:1000])
In [39]:
data = datasets.load_iris()
X = data.data
Y = data.target
X.shape
Out[39]:
In [40]:
X_,c = embedding(X,Y)
In [50]:
data = datasets.load_diabetes()
X = data.data
Y = data.target
X.shape,Y.shape
Out[50]:
In [53]:
X_,c = embedding(X,Y,linkage='ward')
In [93]:
i = np.random.randint(len(X_))
i
Out[93]:
In [94]:
pl.scatter(X_[:,0],X_[:,1],c=Y,
           linewidths=0,cmap='rainbow')
#pl.axis('off')
pl.colorbar()
pl.scatter(X_[i,0],X_[i,1],c='black',
           linewidths=0,s=50)
Out[94]:
In [95]:
# index,label
examples = [(10, 0),
            (100, 4),
            (1694, 7),  # left
            (543, 7),   # right
            (1576, -1), # idk
            (576, -1),  # idk
            ]
In [111]:
examples = []
for i in range(25):
    examples.append((np.random.randint(len(X_)),10))
In [112]:
from sklearn import neighbors
In [117]:
n2 = len(examples)
n = int(np.sqrt(n2))
for i,ex in enumerate(examples):
    pl.subplot(n,n,i+1)
    #pl.figure()
    pl.axis('off')
    #pl.title(ex[1])
    pl.title(data.target[ex[0]],color='blue')
    pl.imshow(data.images[ex[0]],cmap='gray',interpolation='none')
In [ ]:
for index, (image, label) in enumerate(list(zip(data.images, data.target))[:4]):
    pl.subplot(2, 4, index + 1)
    pl.axis('off')
    pl.imshow(image, cmap=pl.cm.gray_r, interpolation='nearest')
    pl.title('Training: %i' % label)
In [140]:
from sklearn.decomposition import PCA
pca = PCA(2)
X_pca = pca.fit_transform(X)
pl.scatter(X_pca[:,0],X_pca[:,1],c=Y,
           linewidths=0)
pl.axis('off')
pl.colorbar()
Out[140]:
In [303]:
#methods = ['single','complete','ward','average','weighted','centroid','median']
methods = ['single','complete','median']
In [304]:
data = datasets.load_digits()
X = data.data
Y = data.target
results = dict()
n=len(X)
In [305]:
from time import time
for method in methods:
    t = time()
    print(method)
    pl.figure()
    results[method] = embedding(X[:n],Y[:n],method)
    pl.title(method)
    pl.savefig("approximate_tpe/" + method + ".pdf")
    print(time() - t)
In [348]:
for method in methods:
    pl.figure()
    pl.title(method)
    X_ = results[method][0]
    pl.scatter(X_[:,0],X_[:,1],c=Y,cmap='jet',linewidths=0)
    pl.savefig("approximate_tpe/" + method + ".pdf")
In [349]:
pl.figure()
#results[method] = embedding(X[:n],Y[:n],method)
pl.title('PCA')
X_ = X_pca
method='pca'
pl.scatter(X_[:,0],X_[:,1],c=Y,cmap='jet',linewidths=0)
pl.savefig("approximate_tpe/" + method + ".pdf")
In [316]:
import matplotlib.pyplot as plt
import matplotlib as mpl
# setup the plot
fig, ax = plt.subplots(1,1, figsize=(6,6))
# define the data
x = np.random.rand(20)
y = np.random.rand(20)
tag = np.random.randint(0,10,20)
#tag[10:12] = 0 # make sure there are some 0 values to showup as grey
# define the colormap
cmap = plt.cm.jet
# extract all colors from the .jet map
cmaplist = [cmap(i) for i in range(cmap.N)]
# force the first color entry to be grey
#cmaplist[0] = (.5,.5,.5,1.0)
# create the new map
cmap = cmap.from_list('Custom cmap', cmaplist, cmap.N)
# define the bins and normalize
bounds = np.linspace(0,10,11)
norm = mpl.colors.BoundaryNorm(bounds, cmap.N)
# make the scatter
scat = ax.scatter(x,y,c=tag,s=np.random.randint(100,500,20),cmap=cmap, norm=norm)
# create a second axes for the colorbar
ax2 = fig.add_axes([0.95, 0.1, 0.03, 0.8])
cb = mpl.colorbar.ColorbarBase(ax2, cmap=cmap, norm=norm, spacing='proportional', ticks=bounds, boundaries=bounds, format='%1i')
ax.set_title('Well defined discrete colors')
#ax2.set_ylabel('Very custom cbar [-]', size=12)
plt.savefig('jet-cmap.jpg')
In [449]:
c = results['single'][1]
d = distance.squareform(hierarchy.cophenet(c))
In [373]:
np.min(d),np.max(d)
Out[373]:
In [384]:
pl.hist(hierarchy.cophenet(c),bins=50);
In [323]:
affinity = 1+np.max(d**2)-(d**2)
In [324]:
from sklearn.manifold import SpectralEmbedding
In [325]:
se = SpectralEmbedding(affinity='precomputed')
In [326]:
X_se = se.fit_transform(affinity)
In [327]:
pl.scatter(X_se[:,0],X_se[:,1],c=Y,
           linewidths=0)
pl.axis('off')
pl.colorbar()
Out[327]:
In [404]:
from sklearn.decomposition import KernelPCA
kpca = KernelPCA(2,kernel='precomputed')
In [329]:
X_kpca = kpca.fit_transform(affinity)
In [330]:
pl.scatter(X_kpca[:,0],X_kpca[:,1],c=Y,
           linewidths=0)
pl.axis('off')
pl.colorbar()
Out[330]:
In [332]:
X_,c = results['single']
In [334]:
c_ = hierarchy.linkage(X_,'single')
In [336]:
hierarchy.dendrogram(c)
In [337]:
hierarchy.dendrogram(c_)
In [138]:
from sklearn import neighbors
def one_nn_baseline(X,Y):
    # fraction of points whose nearest neighbor is the same in X and in Y
    one_nn_X = neighbors.kneighbors_graph(X,2)
    one_nn_Y = neighbors.kneighbors_graph(Y,2)
    sames = 0
    for i in range(len(X)):
        neighbor_X = one_nn_X[i].indices[one_nn_X[i].indices!=i][0]
        neighbor_Y = one_nn_Y[i].indices[one_nn_Y[i].indices!=i][0]
        if neighbor_X == neighbor_Y:
            sames += 1
    return 1.0*sames / len(X)
In [139]:
X_.shape,X.shape
Out[139]:
In [345]:
one_nn_baseline(X_,X),one_nn_baseline(X_pca,X)
Out[345]:
In [92]:
def knn_baseline(X,Y,k=5):
    # fraction of each point's k nearest neighbors shared between X and Y
    k = k+1 # since self is counted as a neighbor in the kneighbors graph
    knn_X = neighbors.kneighbors_graph(X,k)
    knn_Y = neighbors.kneighbors_graph(Y,k)
    sames = 0
    for i in range(len(X)):
        neighbors_X = set(knn_X[i].indices[knn_X[i].indices!=i])
        neighbors_Y = set(knn_Y[i].indices[knn_Y[i].indices!=i])
        sames += len(neighbors_X.intersection(neighbors_Y))
    return 1.0*sames / (len(X)*(k-1))
In [350]:
knn_baseline(X_,X,2),knn_baseline(X_pca,X,2)
Out[350]:
In [364]:
neighbors_n = list(range(1,20))[::2] + list(range(20,50))[::5]
In [365]:
knn_ = [knn_baseline(X_,X,i) for i in neighbors_n]
knn_pca = [knn_baseline(X_pca,X,i) for i in neighbors_n]
In [385]:
pl.plot(neighbors_n,knn_,label='Approx. TPE')
pl.plot(neighbors_n,knn_pca,label='2D PCA')
pl.legend(loc='best')
Out[385]:
In [306]:
def knn_baseline_curve(X,Y,ks=range(1,50)):
    # neighborhood preservation at several k, reusing one kneighbors graph
    max_k = max(ks)+1 # since self is counted as a neighbor in the kneighbors graph
    knn_X = neighbors.kneighbors_graph(X,max_k)
    knn_Y = neighbors.kneighbors_graph(Y,max_k)
    sames = np.zeros(len(ks))
    for ind,k in enumerate(ks):
        for i in range(len(X)):
            neighbors_X = set(knn_X[i].indices[knn_X[i].indices!=i][:k])
            neighbors_Y = set(knn_Y[i].indices[knn_Y[i].indices!=i][:k])
            sames[ind] += len(neighbors_X.intersection(neighbors_Y))
        sames[ind] /= (len(X)*k)
    return sames
In [398]:
knn_tpe = knn_baseline_curve(X_,X,neighbors_n)
knn_pca = knn_baseline_curve(X_pca,X,neighbors_n)
In [419]:
kpca = KernelPCA(2,'poly',degree=3)
X_kpca = kpca.fit_transform(X)
knn_kpca = knn_baseline_curve(X_kpca,X,neighbors_n)
In [420]:
pl.scatter(X_kpca[:,0],X_kpca[:,1],c=Y,linewidth=0)
Out[420]:
In [91]:
def one_nn_class_baseline(X,Y):
    # accuracy of predicting each point's label from its nearest neighbor's label
    one_nn = neighbors.kneighbors_graph(X,2)
    inds = np.zeros(len(X),dtype=int)
    for i in range(len(X)):
        inds[i] = [ind for ind in one_nn[i].indices if ind != i][0]
    preds = Y[inds]
    return 1.0*sum(preds==Y) / len(Y)
In [442]:
one_nn_class_baseline(X,Y)
Out[442]:
In [319]:
from sklearn.manifold import Isomap
iso = Isomap()
X_iso = iso.fit_transform(X)
from sklearn.manifold import LocallyLinearEmbedding
lle = LocallyLinearEmbedding()
X_lle = lle.fit_transform(X)
In [108]:
pl.scatter(X_lle[:,0],X_lle[:,1],c=Y,
           linewidths=0,cmap='rainbow')
#pl.axis('off')
pl.colorbar()
Out[108]:
In [454]:
vec = (one_nn_class_baseline(X_,Y),one_nn_class_baseline(X_pca,Y),one_nn_class_baseline(X_kpca,Y))
In [329]:
baselines = [one_nn_class_baseline(X_pca,Y)]#,
#one_nn_class_baseline(X_iso,Y)]
#one_nn_class_baseline(X_lle,Y)]
res = [one_nn_class_baseline(results[m][0],Y) for m in methods]  # iterate in methods order to match method_names below
vec = baselines + res
In [330]:
method_names = [m+'-TPE' for m in methods]
In [331]:
fig, ax = pl.subplots()
barlist = ax.bar(range(len(vec)),vec)
pl.hlines(one_nn_class_baseline(X,Y),0,len(vec),linestyles='--')
pl.xlabel('Algorithm')
pl.ylabel('1NN Classification Accuracy')
pl.title('1NN Classification in Low-Dimensional Embeddings')
baseline_names = ['PCA']#,'Isomap']#,'LLE']
pl.xticks(range(len(vec)), baseline_names + method_names,rotation=30)
#pl.ylim(0.25,1.0)
def autolabel(rects):
    # attach some text labels
    for rect in rects:
        height = rect.get_height()
        ax.text(rect.get_x()+rect.get_width()/2., height-0.075, '{0:.2f}'.format(height),
                ha='center', va='bottom',color='white')
autolabel(barlist)
for i in range(len(baseline_names)):
    barlist[i].set_color('gray')
for i in range(len(baseline_names),len(vec)):
    barlist[i].set_color('blue')
pl.savefig('../figures/tpe-comparison.pdf')
In [501]:
vec[-1]
Out[501]:
In [502]:
one_nn_class_baseline(X,Y)
Out[502]:
In [150]:
neighbors_n = list(range(1,20))[::2] + list(range(20,50))[::5]
In [334]:
neighbors_n = list(range(1,20))[::1] + list(range(20,50))[::5] + list(range(50,300))[::10]# + list(range(300,1796))[::100]
knn_tpe = dict()
print('PCA')
knn_pca = [knn_baseline(X_pca,X,i) for i in neighbors_n]
#print('Iso')
#knn_iso = knn_baseline_curve(X_iso,X,neighbors_n)
#print('LLE')
#knn_lle = knn_baseline_curve(X_lle,X,neighbors_n)
for method in methods:
    print(method)
    knn_tpe[method] = [knn_baseline(results[method][0],X,i) for i in neighbors_n]
In [335]:
pl.plot(neighbors_n,knn_pca,label='PCA',linestyle='--')
#pl.scatter(neighbors_n,knn_pca,linewidths=0)
#pl.plot(neighbors_n,knn_iso,label='Isomap',linestyle='--')
#pl.scatter(neighbors_n,knn_iso,linewidths=0)
#pl.plot(neighbors_n,knn_lle,label='LLE',linestyle='--')
#pl.scatter(neighbors_n,knn_lle,linewidths=0)
pl.legend(loc='best')
pl.xlabel('# Nearest Neighbors')
pl.ylabel('Fraction conserved')
pl.title('Neighborhood preservation')
Out[335]:
In [339]:
for method in methods:
    name = method + '-TPE'
    pl.plot(neighbors_n,knn_tpe[method],label=name,linewidth=2)
#pl.scatter(neighbors_n,knn_tpe,linewidths=0)
#pl.plot(neighbors_n,knn_medtpe,label='Median-TPE',linewidth=2)
pl.plot(neighbors_n,knn_pca,label='PCA',linestyle='--')
#pl.scatter(neighbors_n,knn_pca,linewidths=0)
#pl.plot(neighbors_n,knn_iso,label='Isomap',linestyle='--')
#pl.scatter(neighbors_n,knn_iso,linewidths=0)
#pl.plot(neighbors_n,knn_lle,label='LLE',linestyle='--')
#pl.scatter(neighbors_n,knn_lle,linewidths=0)
pl.legend(loc='best')
pl.xlabel('# Nearest Neighbors')
pl.ylabel('Fraction conserved')
pl.title('Neighborhood preservation')
#pl.xlim(0,max(neighbors_n))
pl.xlim(0,300)
pl.ylim(0,0.65)
pl.savefig('lol.pdf')
In [ ]:
# examine first-derivatives of this?
In [541]:
pl.plot(neighbors_n,knn_tpe,label='Single-TPE',linewidth=2)
#pl.scatter(neighbors_n,knn_tpe,linewidths=0)
pl.plot(neighbors_n,knn_medtpe,label='Median-TPE',linewidth=2)
pl.plot(neighbors_n,knn_pca,label='PCA')
#pl.scatter(neighbors_n,knn_pca,linewidths=0)
pl.plot(neighbors_n,knn_iso,label='Isomap')
#pl.scatter(neighbors_n,knn_iso,linewidths=0)
pl.plot(neighbors_n,knn_lle,label='LLE')
#pl.scatter(neighbors_n,knn_lle,linewidths=0)
pl.legend(loc='best')
pl.xlabel('# Nearest Neighbors')
pl.ylabel('Fraction conserved')
pl.title('Neighborhood preservation')
pl.xlim(0,max(neighbors_n))
Out[541]:
In [537]:
pl.plot(neighbors_n,knn_tpe,label='Approx. TPE')
#pl.scatter(neighbors_n,knn_tpe,linewidths=0)
pl.plot(neighbors_n,knn_pca,label='PCA')
#pl.scatter(neighbors_n,knn_pca,linewidths=0)
pl.plot(neighbors_n,knn_iso,label='Isomap')
#pl.scatter(neighbors_n,knn_iso,linewidths=0)
pl.plot(neighbors_n,knn_lle,label='LLE')
#pl.scatter(neighbors_n,knn_lle,linewidths=0)
pl.legend(loc='best')
pl.xlabel('# Nearest Neighbors')
pl.ylabel('Fraction conserved')
pl.title('Neighborhood preservation')
pl.xlim(min(neighbors_n),10)
pl.ylim(0,0.2)
Out[537]:
In [299]:
# other idea: what happens if you do this with the MST?
import networkx as nx
from scipy.spatial.distance import squareform,pdist
n=len(X)
ind = sorted(np.arange(n),key=lambda i:Y[i])
d = squareform(pdist(X[ind]))
mds = MDS(dissimilarity='precomputed')
G = nx.Graph(d)
G_ = nx.minimum_spanning_tree(G)
d_dict = dict(nx.all_pairs_shortest_path_length(G_))  # hop counts along MST paths
d_ = np.zeros((n,n))
for i in range(n):
    for j in range(n):
        d_[i,j] = d_dict[i][j]
#X_ = mds.fit_transform(d_)
X_ = mds.fit_transform(d_**2)
pl.scatter(X_[:,0],X_[:,1],c=Y[ind],linewidths=0,cmap='rainbow')
Out[299]:
In [ ]:
pl.imshow(d_,interpolation='none')
In [294]:
pl.scatter(X_[:,0],X_[:,1],c=Y[ind],linewidths=0,cmap='rainbow')  # rows of X_ follow the sorted index ind
pl.colorbar()
pl.title('Multidimensional scaling of shortest-path-distances in MST (Digits)')
Out[294]:
In [295]:
one_nn_class_baseline(X_,Y[ind])
Out[295]:
In [297]:
knn_baseline(X[ind],X_,1)
Out[297]:
In [566]:
from sklearn.neighbors import KernelDensity
kde = KernelDensity()
kde.fit(np.random.randn(10,2)*10)
Out[566]:
In [603]:
level_0 = np.random.randn(5,2)*10
kde = KernelDensity(2.0)
kde.fit(level_0)
level_1 = kde.sample(5*5)
kde = KernelDensity(0.5)
kde.fit(level_1)
level_2 = kde.sample(5*5*5*10)
In [604]:
pl.scatter(level_2[:,0],level_2[:,1],s=10,linewidths=0,alpha=0.2)
pl.scatter(level_1[:,0],level_1[:,1],s=15,linewidths=0,alpha=0.5,c='green')
pl.scatter(level_0[:,0],level_0[:,1],s=20,c='red')
Out[604]:
In [606]:
r = embedding(level_2)
In [609]:
X_ = r[0]
pl.scatter(X_[:,0],X_[:,1],s=20,c=level_2[:,0],linewidths=0)
Out[609]:
In [ ]:
# generate synthetic data with clusters of clusters
# (reconstruction of this unfinished cell, following the manual
#  two-level KDE sampling in the cells above)
def hierarchical_sampling(n_samples=10000, n_dim=2,
                          num_clust_per_level=[5, 5],
                          bandwidth=2.0):
    # start from widely spread top-level cluster centers
    centers = np.random.randn(num_clust_per_level[0], n_dim) * 10
    for num_clust in num_clust_per_level[1:]:
        # sample the next level of centers around the current ones,
        # tightening the bandwidth so sub-clusters stay local
        kde = KernelDensity(bandwidth=bandwidth)
        kde.fit(centers)
        centers = kde.sample(len(centers) * num_clust)
        bandwidth /= 4.0
    kde = KernelDensity(bandwidth=bandwidth)
    kde.fit(centers)
    return kde.sample(n_samples)
In [350]:
from scipy.spatial import KDTree
In [351]:
kdt = KDTree(X)
In [362]:
# construct a hybrid from several linkage methods
linkages = ['single','complete','ward','average','weighted','centroid','median']
def pairwise_cophenetic_distances(X,linkage='single'):
    return hierarchy.cophenet(hierarchy.linkage(X,linkage))
In [365]:
p_l = [pairwise_cophenetic_distances(X,l) for l in linkages]
In [368]:
p_l = np.array(p_l)
p_l[0],p_l[1],p_l[2],p_l[3]
Out[368]:
In [369]:
[len(set(p)) for p in p_l]
Out[369]:
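The hybrid itself is never assembled above; a minimal sketch, assuming an unweighted average of the per-linkage cophenetic distances is the intended combination:
In [ ]:
# assumption: combine by simple averaging of the condensed cophenetic distances
p_hybrid = p_l.mean(0)
d_hybrid = distance.squareform(p_hybrid)
X_hybrid = MDS(dissimilarity='precomputed').fit_transform(d_hybrid)
pl.scatter(X_hybrid[:,0], X_hybrid[:,1], c=Y, cmap='rainbow', linewidths=0)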